# Notebook setup: silence warnings, imports, and global plot styling.
import warnings
warnings.filterwarnings('ignore')
import json
import urllib3
import time
import urllib.request
import pandas as pd
# NOTE(review): pandas.io.json.json_normalize was removed in pandas 2.0;
# pd.json_normalize is the modern replacement. Import appears unused below.
from pandas.io.json import json_normalize
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Save figures tightly cropped, PDF by default, with framed legends.
plt.rcParams['savefig.bbox'] = 'tight'
plt.rcParams['savefig.pad_inches'] = 0
plt.rcParams['savefig.format'] = 'pdf'
plt.rcParams['legend.frameon'] = True
#pd.set_option('display.max_rows', 50)
#pd.set_option('display.max_columns', 50)
import seaborn as sns
sns.set_context('notebook')
sns.set_style('whitegrid')
sns.set_palette('deep')
First, we look at the number of nodes, the number of edges, and the average degree for each window size over the whole period. Note that "number of nodes" means the number of individuals who were involved in at least one interaction within the relevant period.
def fullPlot(toPlot, title, x, y, scale, start, end):
    """Plot the degree statistic `toPlot` over [start, end] for the
    aggregate graph, the year/month/week/day windows and the hour window
    on a single axis, then save to graphs/<toPlot>.png.

    Parameters
    ----------
    toPlot : column to plot ('vertices', 'edges' or derived 'avgdeg').
    title  : figure title (currently unused; kept for interface stability).
    x, y   : axis-label strings.
    scale  : y-axis scale ('linear' or 'log').
    start, end : date strings used to slice the time-indexed data.
    """
    windows = [31536000000, 2592000000, 604800000, 86400000]  # ms: year..day
    labels = ['Year Window', 'Month Window', 'Week Window', 'Day Window']
    fig, ax = plt.subplots(figsize=(11.7, 8.27))  # A4 landscape, inches
    plt.xlabel(x, fontsize=30)
    plt.ylabel(y, fontsize=30)
    ax.set_yscale(scale)

    def load_views(path):
        # Load a {'views': [...]} JSON file into a time-indexed DataFrame
        # sliced to [start, end], with the derived average-degree column.
        with open(path) as fh:
            frame = pd.DataFrame(json.load(fh)['views'])
        frame['time'] = pd.to_datetime(frame['time'], unit='ms')
        frame['index'] = frame['time']
        frame.set_index('index', inplace=True)
        frame = frame[start:end].copy()  # copy avoids SettingWithCopyWarning
        # Average degree = 2|E|/|V|; guard against empty graphs (|V| < 1).
        frame['avgdeg'] = np.where(frame['vertices'] < 1,
                                   frame['vertices'],
                                   2 * frame['edges'] / frame['vertices'])
        return frame

    # Fixed palette indexing replaces the private ax._get_lines.prop_cycler
    # API (removed in matplotlib 3.8). The aggregate plot below consumes
    # colour 0 of the axes cycle, so the window plots start at colour 1,
    # matching the original colour sequence.
    palette = plt.rcParams['axes.prop_cycle'].by_key()['color']

    aggregate = load_views('degree/degrees.json')
    aggregate = aggregate[aggregate[toPlot] != 0]
    aggregate.plot(x='time', y=toPlot, ax=ax, label="Aggregate Graph")

    # Sliding-window stats. (The shuffled-timestamp files previously loaded
    # here were never plotted — dead code removed.)
    sliding = load_views('degree/degreewindows.json')
    for k, size in enumerate(windows):
        subset = sliding[sliding['windowsize'] == size]
        subset.plot(x='time', y=toPlot, ax=ax, label=labels[k],
                    color=palette[(k + 1) % len(palette)])

    hourly = load_views('degree/degreehours.json')
    hourly = hourly[hourly[toPlot] != 0]
    hourly.plot(x='time', y=toPlot, ax=ax,
                color=palette[(len(windows) + 1) % len(palette)],
                label="Hour window")

    plt.legend(fontsize=20, loc='upper left')
    plt.xlabel('Date', fontsize=30)
    plt.rc('xtick', labelsize=20)
    plt.rc('ytick', labelsize=20)
    plt.savefig('graphs/' + toPlot + '.png')
    plt.show()
def plot_window_scale(toPlot, title, y, no_parts=3):
    """Plot the mean of `toPlot` (with a 1.96-sd band) against window size
    on log-log axes, with the observation split into `no_parts` periods.

    Parameters
    ----------
    toPlot   : column to aggregate ('vertices', 'edges' or 'avgdeg').
    title    : unused; kept for interface stability.
    y        : y-axis label.
    no_parts : number of consecutive chunks the series is split into.
    """
    fig, ax = plt.subplots(figsize=(11.7, 8.27))  # A4 landscape
    plt.xlabel("Window Size", fontsize=30)
    plt.ylabel(y, fontsize=30)
    ax.set_xscale('log')
    ax.set_yscale('log')
    # Window sizes in ms: hour, day, week, month, year. Column 0 (hour)
    # comes from its own file; columns 1-4 from the windowed file.
    windows = np.array([3600000, 86400000, 604800000, 2592000000, 31536000000])
    means = np.zeros((no_parts, 5), dtype=float)
    sds = np.zeros((no_parts, 5), dtype=float)

    def load(path):
        # Load one stats file, derive average degree, drop zero rows.
        with open(path) as fh:
            frame = pd.DataFrame(json.load(fh)['views'])
        frame['time'] = pd.to_datetime(frame['time'], unit='ms')
        frame['index'] = frame['time']
        frame.set_index('index', inplace=True)
        frame['avgdeg'] = np.where(frame['vertices'] < 1, frame['vertices'],
                                   2 * frame['edges'] / frame['vertices'])
        return frame[frame[toPlot] != 0]

    def fill_column(frame, col):
        # Split into `no_parts` consecutive chunks and record mean/std.
        for part, chunk in enumerate(np.array_split(frame, no_parts)):
            means[part, col] = chunk[toPlot].mean()
            sds[part, col] = chunk[toPlot].std()

    windowed = load('degree/degreewindows.json')
    for j in range(1, 5):  # windows[j] is already an int; no round() needed
        fill_column(windowed[windowed['windowsize'] == windows[j]], j)
    fill_column(load('degree/degreehours.json'), 0)  # hour window

    for part in range(no_parts):
        plt.plot(windows, means[part], label="Period " + str(part + 1),
                 marker='^')
        # 95% band assuming normality of the per-window values.
        plt.fill_between(windows, means[part] - 1.96 * sds[part],
                         means[part] + 1.96 * sds[part], alpha=0.3)
    plt.rc('xtick', labelsize=20)
    plt.rc('ytick', labelsize=20)
    plt.legend(loc='upper left', fontsize=20)
    plt.show()
First we examine how the number of nodes is reported according to each window size. We see that:
# Node counts: full-period time series per window, then mean node count
# against window size (log-log) with 5 period splits.
fullPlot('vertices','Total Number of Vertices','Date','Number of Vertices','linear','2016-09-30 23:00:00','2018-10-30')
plot_window_scale('vertices', "Mean number of Nodes", "Mean number of Nodes", 5)
Not much to say here apart from, again, the hugely differing scales of looking at a month vs a year.
# Edge counts: same pair of plots as for the node counts above.
fullPlot('edges', 'Total Number of Edges', 'Date', 'Edges', 'linear','2016-09-30 23:00:00','2018-10-30')
plot_window_scale('edges', "Mean number of Edges", "Mean number of Edges", 5)
Calculated as 2x|edges|/|vertices|. We see that:
# Average degree (2|E|/|V|): same pair of plots as above.
fullPlot('avgdeg', 'Average Degree', 'Date', 'Average Degree', 'linear','2016-09-30 23:00:00','2018-10-30')
plot_window_scale('avgdeg', "Average Degree", "Average Degree", 5)
from matplotlib import gridspec
def plotNewVsExisting(toPlot,start,end,y,scale):
    """Two stacked panels sharing an x axis: the top panel shows the
    proportion of `toPlot` that is new per window (diff of the aggregate
    series over one window length), the bottom panel the raw per-window
    values plus the aggregate graph.

    NOTE(review): the label parameter ``y`` is shadowed by DataFrames
    below, and ``ax1._get_lines.prop_cycler`` is private matplotlib API
    (removed in 3.8) — both left unchanged here.
    """
    windows = [31536000000,2592000000,604800000, 86400000]  # ms: year..day
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)  # A4 landscape, inches
    fig = plt.figure(figsize=a4_dims)
    # Top panel (proportion new) half the height of the bottom panel.
    gs = gridspec.GridSpec(2, 1, height_ratios=[1, 2])
    ax0 = fig.add_subplot(gs[0])
    ax0.set_ylim((0,1))
    ax1 = fig.add_subplot(gs[1],sharex=ax0)
    plt.xlabel("Date",fontsize=30)
    plt.ylabel(y,fontsize=20)
    ax0.set_yscale(scale)
    # Aggregate (ever-growing) graph: also the baseline for diff() below.
    with open('degree/degrees.json') as json_file:
        x = json.load(json_file)
        x= pd.DataFrame(x['views'])
        x['time'] = pd.to_datetime(x['time'],unit='ms')
        x['index'] = pd.to_datetime(x['time'],unit='ms')
        x.set_index('index', inplace=True)
        aggr =x[start:end]
        aggr.plot(x='time', y=toPlot,ax=ax1, label="Aggregate Graph")
    # Sliding-window stats (real data) plus the shuffled-timestamp null
    # model (loaded, but its plot is currently commented out).
    with open('degree/degreewindows.json') as json_file:
        with open('degree/degreesorted.json') as json_2:
            cc1 = json.load(json_file)
            cc2 = json.load(json_2)
            cc1= pd.DataFrame(cc1['views'])
            cc2= pd.DataFrame(cc2['views'])
            cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['time'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
            cc2['index'] = pd.to_datetime(cc2['time'],unit='ms')
            cc1.set_index('index', inplace=True)
            cc2.set_index('index', inplace=True)
            cc1=cc1[start:end]
            cc2=cc2[start:end]
            index = 0
            for i in windows:
                # Window length in days; the aggregate series is assumed
                # daily, so diff(diff_size) is the growth over one window
                # -- TODO confirm sampling interval of degrees.json.
                diff_size = int(i/86400000);
                color=next(ax1._get_lines.prop_cycler)['color']
                y = cc1[cc1['windowsize'] == i]
                z = cc2[cc2['windowsize'] == i]
                # Average degree = 2|E|/|V|, guarded for empty graphs.
                y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
                y['new']= aggr[toPlot].diff(diff_size)
                y['prop_new'] = np.where(y[toPlot]<1,1,y['new']/y[toPlot])
                y = y[y[toPlot] != 0]
                y.plot(x='time', y=toPlot,ax=ax1, label=labels[index], color = color)
                y.plot(x='time', y="prop_new",ax=ax0, label=labels[index], color = color)
                #z.plot(x='time', y=toPlot,ax=ax, label='_nolegend_', color = color, linestyle="--", alpha=0.8)
                index +=1
    # Hour-window data (plus its shuffled counterpart, plot commented out).
    with open('degree/degreehours.json') as json_file:
        with open('degree/degreesortedhour.json') as json_2:
            color=next(ax1._get_lines.prop_cycler)['color']
            x = json.load(json_file)
            y = json.load(json_2)
            x= pd.DataFrame(x['views'])
            y= pd.DataFrame(y['views'])
            x['index'] = pd.to_datetime(x['time'],unit='ms')
            y['index'] = pd.to_datetime(y['time'],unit='ms')
            x['time'] = pd.to_datetime(x['time'],unit='ms')
            y['time'] = pd.to_datetime(y['time'],unit='ms')
            x.set_index('index', inplace=True)
            y.set_index('index', inplace=True)
            x =x[start:end]
            y =y[start:end]
            #x['prop_new'] = np.where(x[toPlot]<1,1,x['new']/x[toPlot])
            x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
            y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
            x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
            y['hour'] = y.apply(lambda row: int(row['time'].hour),axis=1)
            x = x[x[toPlot] != 0]
            y = y[y[toPlot] != 0]
            x.plot(x='time', y=toPlot,ax=ax1, color=color, label="Hour window")
            #x.plot(x='time', y="prop_new",ax=ax0, color=color, label="Hour window")
            #y.plot(x='time', y=toPlot,ax=ax, color=color, label='_nolegend_',linestyle="--", alpha=0.8)
    #plt.axvline('2016-11-09',linestyle='')
    #plt.axvline('2017-08-11')
    #plt.axvline('2017-08-13')
    ax0.get_legend().remove()
    ax0.set_title("Proportion of "+toPlot+" that are new", fontsize=20)
    ax1.legend(fontsize=15, loc='upper left')
    plt.xlabel('Date',fontsize=30)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.savefig('graphs/'+toPlot+'.png')
    plt.show()
# New-vs-existing panels for nodes and edges over the full study period.
plotNewVsExisting("vertices","2016-09-30 23:00:00","2018-10-30","Number of Vertices","linear")
plotNewVsExisting("edges","2016-09-30 23:00:00","2018-10-30","Number of Edges","linear")
This section contains the analysis of the size/proportion of the largest connected components, as well as the number of connected components. In the proportion and number, we exclude components comprising just one edge (two nodes) from the calculations.
def fullPlot(toPlot, title, x, y, scale, start, end):
    """Plot the connected-component statistic `toPlot` over [start, end]
    for the aggregate graph, the year/month/week/day windows and the hour
    window. (Redefines the degree-data fullPlot above for the CC files.)

    Parameters
    ----------
    toPlot : column to plot, e.g. 'proportionWithoutIslands', 'biggest',
             'totalWithoutIslands'.
    title  : figure title (currently unused; kept for interface stability).
    x, y   : axis-label strings.
    scale  : y-axis scale ('linear' or 'log').
    start, end : date strings used to slice the time-indexed data.
    """
    windows = [31536000000, 2592000000, 604800000, 86400000]  # ms: year..day
    labels = ['Year Window', 'Month Window', 'Week Window', 'Day Window']
    fig, ax = plt.subplots(figsize=(11.7, 8.27))  # A4 landscape
    plt.xlabel(x, fontsize=30)
    plt.ylabel(y, fontsize=30)
    ax.set_yscale(scale)

    def load(path):
        # Load one stats file into a time-indexed frame sliced to the range.
        with open(path) as fh:
            frame = pd.DataFrame(json.load(fh)['views'])
        frame['time'] = pd.to_datetime(frame['time'], unit='ms')
        frame['index'] = frame['time']
        frame.set_index('index', inplace=True)
        return frame[start:end]

    # Aggregate graph (rows with total == 0 carry no data).
    aggregate = load('CC/nowindow.json')
    aggregate = aggregate[aggregate['total'] != 0]
    aggregate.plot(x='time', y=toPlot, ax=ax, label="Aggregate Graph")

    # Day-to-year windows. (The rolling 'mean' column and the unused
    # per-window hour string in the original were dead code — removed.)
    windowed = load('CC/bigCC.json')
    windowed = windowed[windowed[toPlot] != 0]
    for k, size in enumerate(windows):
        subset = windowed[windowed['windowsize'] == size]
        subset.plot(x='time', y=toPlot, ax=ax, label=labels[k])

    hourly = load('CC/cc1hour.json')
    hourly.plot(x='time', y=toPlot, ax=ax, color="#f0134d",
                label="Hour window")

    ax.legend(fontsize=20, framealpha=0.9, loc='upper left')
    plt.tight_layout()
    plt.rc('xtick', labelsize=20)
    plt.rc('ytick', labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.xlabel("Date")
    plt.savefig("Graphs/" + toPlot + "partial.png")
    plt.show()
# Convenience date ranges: a one-month subset and the full study period.
partial_dates = ['2017-09-30 23:00:00','2017-10-30']
full_dates = ['2016-09-30 23:00:00','2018-10-30']
#x['mean'] = x[toPlot].rolling(window=24,center=False).mean()
#[0,4,8,12,16,20,24]
#'2016-07-01': '2018-05-03'
We measure the size of the largest connected component as a proportion of the size of the whole graph for that window.
We find that:
# LCC proportion over the full study period.
fullPlot('proportionWithoutIslands','Largest Connected Component % of Graph','Date','Proportion of Graph','linear','2016-09-30 23:00:00','2018-10-30')
When zooming in to just a month subset of data, we see that the hourly window size is actually showing diurnal behaviour which we will explore later.
# Same plot zoomed to a single month; diurnal structure becomes visible.
fullPlot('proportionWithoutIslands','Largest Connected Component % of Graph','Date','Proportion of Graph','linear','2017-09-30 23:00:00','2017-10-30')
On the whole, this shows similar behaviour to just the number of vertices.
# Absolute size of the largest connected component, full period.
fullPlot('biggest','Largest Connected Component Size','Date','Largest connected Component','linear','2016-09-30 23:00:00','2018-10-30')
This seems to be similar in trend to the size of the largest connected component (note the much smaller scale of ~600 components in total).
# Number of connected components (single-edge "islands" excluded).
fullPlot('totalWithoutIslands','Number of Connected Components','Date','Total Connected Components','linear','2016-09-30 23:00:00','2018-10-30')
The plot below shows a CDF of the 'proportion' data for different window sizes, with particular attention to sizes between an hour and a day. It would perhaps be helpful to provide a takeaway statistic such as "for all window sizes, there is a component of size x% for y% of the time".
def proportionWindowCDFPlot(toPlot, title, x, y, scale, start, end):
    """Plot empirical CDFs of `toPlot` for the aggregate graph and for
    every window size from one year down to one hour.

    Parameters
    ----------
    toPlot : column whose distribution is shown (e.g. 'proportion').
    title  : figure title.
    x, y   : axis-label strings.
    scale  : y-axis scale ('linear' or 'log').
    start, end : date strings used to slice the time-indexed data.
    """
    num_bins = 100
    windows = [31536000000, 2592000000, 604800000, 86400000]   # year..day, ms
    windows2 = [43200000, 21600000, 14400000, 7200000]         # 12h..2h, ms
    labels = ["Year Window", 'Month Window', 'Week Window', 'Day Window']
    labels2 = ["12 Hour Window", "6 Hour Window", "4 Hour Window",
               "2 Hour Window"]
    fig, ax = plt.subplots(figsize=(11.7, 8.27))  # A4 landscape
    plt.title(title, fontsize=20)
    plt.xlabel(x, fontsize=16)
    plt.ylabel(y, fontsize=16)
    ax.set_yscale(scale)

    def load(path):
        # Load one stats file into a time-indexed frame sliced to the range.
        with open(path) as fh:
            frame = pd.DataFrame(json.load(fh)['views'])
        frame['time'] = pd.to_datetime(frame['time'], unit='ms')
        frame['index'] = frame['time']
        frame.set_index('index', inplace=True)
        return frame[start:end]

    def plot_cdf(values, label):
        # Empirical CDF via a raw-count histogram. The `normed=` keyword
        # was removed from np.histogram in NumPy 1.24; plain counts are
        # what the old normed=False produced.
        counts, bin_edges = np.histogram(values, bins=num_bins)
        cdf = np.cumsum(counts)
        line, = plt.plot(bin_edges[1:], cdf / cdf[-1])
        line.set_label(label)

    aggregate = load('CC/nowindow.json')
    aggregate = aggregate[aggregate['total'] != 0]
    plot_cdf(aggregate[toPlot], "No window")

    big = load('CC/bigCC.json')
    big = big[big[toPlot] != 0]
    for k, size in enumerate(windows):
        plot_cdf(big[big['windowsize'] == size][toPlot], labels[k])

    # Sub-day windows (12h..2h) live in a separate file.
    second = load('CC/ccwindowsecondset.json')
    second = second[second[toPlot] != 0]
    for k, size in enumerate(windows2):
        plot_cdf(second[second['windowsize'] == size][toPlot], labels2[k])

    hourly = load('CC/cc1hour.json')
    hourly = hourly[hourly['total'] != 0]
    plot_cdf(hourly[toPlot], "Hour window")

    ax.legend(fontsize=16)
    plt.savefig("Graphs/ConnectedComponentsCDF.png")
    plt.show()
# CDF of the LCC proportion across all window sizes for the study period.
proportionWindowCDFPlot('proportion','Biggest Connected Components % of graph','Proportion of the graph',
'Proportion of Time','linear','2016-09-01', '2018-05-03')
Batching the data by hour of the day we can see some diurnal behaviour as it's a mostly US-based platform.
Seaborn's documentation: the parameter `whis` controls how far the whiskers stretch. The upper whisker stretches to the furthest datapoint within [UQ, UQ + whis*IQR].
def diurnal_plot(toPlot, title, x, y, scale, start, end):
    """Boxplot of the hourly-window statistic `toPlot` grouped by hour of
    day (UTC), exposing diurnal structure. Saves to Graphs/<y>.png.

    Parameters
    ----------
    toPlot : column to plot (e.g. 'proportion', 'biggest').
    title  : unused; kept for interface stability.
    x, y   : axis-label strings.
    scale  : y-axis scale ('linear' or 'log').
    start, end : date strings used to slice the time-indexed data.
    """
    fig, ax = plt.subplots(1, 1, figsize=(11.7, 8.27))  # A4 landscape
    ax.set_xlabel(x, fontsize=30)
    ax.set_ylabel(y, fontsize=30)
    ax.set_yscale(scale)
    with open('CC/cc1hour.json') as json_file:
        data = pd.DataFrame(json.load(json_file)['views'])
    data['index'] = pd.to_datetime(data['time'], unit='ms')
    data['time'] = pd.to_datetime(data['time'], unit='ms')
    data.set_index('index', inplace=True)
    data = data[start:end]
    data = data.set_index('time')
    # Index.weekday_name was removed in pandas 1.0; day_name() is the
    # supported replacement.
    data['Weekday Name'] = data.index.day_name()
    data['Hour'] = data.index.hour
    data = data[data['total'] != 0]  # rows with total == 0 carry no data
    # (Unused per-hour mean/std aggregation from the original removed.)
    ax = sns.boxplot(data=data, x='Hour', y=toPlot)
    plt.xlabel("Hour (UTC)")
    plt.rc('xtick', labelsize=20)
    plt.rc('ytick', labelsize=20)
    plt.ylabel(y, fontsize=30)
    plt.tight_layout()
    plt.savefig("Graphs/" + y.replace(" ", "_") + ".png")
    plt.show()
# Hour-of-day boxplots for proportion, absolute LCC size and CC count.
diurnal_plot('proportion','Largest Connected Component % of graph','Hour','Proportion of Graph','linear','2016-11-30', '2018-10-30')
diurnal_plot('biggest','Largest Connected Component Size','Hour','Largest connected Component','linear','2016-11-30','2018-10-30')
diurnal_plot('totalWithoutIslands','Total Connected Components','Hour','Total Connected Components','linear','2016-11-30','2018-10-30')
This section explores deeper the existence of periodic/diurnal behaviour in the size/proportion of giant component size using fourier analysis. We will look first at the proportion data.
The Fourier transform and power spectrum of the data show peaks at the 24-hour, 12-hour, and 6-hour frequencies.
# Look at the power spectrum of the hourly LCC-proportion data.
from scipy import fftpack, signal
with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    json_file.close()  # redundant inside `with`; kept as-is
# Remove the linear trend so slow drift does not dominate the spectrum.
data_prop = signal.detrend(np.array(x['proportion']),)
# plot original (detrended) data
plt.plot(range(len(data_prop)),data_prop)
plt.title('Plot of original data')
plt.show()
# Fourier series; fs = 168 samples (hours) per week, so frequencies read
# as "cycles per week".
fs = 168
x = fftpack.fft(data_prop)  # NOTE: rebinds x from DataFrame to coefficients
# x is a vector of complex Fourier coefficients
freqs = fftpack.fftfreq(len(data_prop))*fs
fig, ax = plt.subplots()
ax.stem(freqs, np.abs(x))
ax.set_xlabel('Frequency (times per '+str(fs)+'hrs)')
ax.set_xlim(0,0.5*fs)  # positive frequencies up to Nyquist
ax.set_ylabel('Frequency Domain (Spectrum) Magnitude')
plt.title('Fourier transform of hourly data')
plt.tight_layout()
plt.show()
# Compute power spectrum for clearer picture
freqs, P_xx = signal.periodogram(data_prop, fs, scaling = 'density')
plt.plot(freqs, P_xx)
plt.title('Power spectrum of data')
plt.show()
Below is the inverse Fourier transform (IFT) of the first few frequencies, showing the 'hum'.
# Inverse FT of the first few frequencies only -- the low-frequency "hum".
# The buffer must be complex: a float np.zeros() silently discards the
# imaginary part of the FFT coefficients on assignment (ComplexWarning).
tmp = np.zeros(len(x), dtype=complex)
tmp[:20] = x[:20]
IFT = fftpack.ifft(tmp)
plt.plot(range(len(data_prop)), data_prop, color='black')
plt.plot(range(len(data_prop)), np.real(IFT), color='red')
plt.show()
The IFT of the largest magnitude 10 frequencies looks fairly as expected, looking like a sine wave with 24h period.
# IFT of the 10 largest-magnitude harmonics against real data for 1 week.
ix_full = np.argsort(-1 * np.absolute(x))  # coefficient indices, descending
res_full = np.absolute(x[ix_full])
# np.complex was removed in NumPy 1.24; the builtin `complex` is the
# documented replacement.
tmp = np.zeros(len(x), dtype=complex)
tmp[ix_full[:10]] = x[ix_full[:10]]
res1 = np.fft.ifft(tmp)
plt.figure(figsize=(8, 5))
plt.plot(range(168), data_prop[1000:1168], color='black')
plt.plot(range(168), np.real(res1[1000:1168]), color='red', linewidth=3)
plt.show()
If we allow the IFT of the largest 240 harmonics we actually see a second slightly smaller peak slightly before a bigger peak, suggesting the presence of a European userbase.
# IFT of the 240 largest harmonics: reveals a second, slightly earlier
# peak each day, suggesting a European userbase.
tmp = np.zeros(len(x), dtype=complex)  # np.complex removed in NumPy 1.24
tmp[ix_full[:240]] = x[ix_full[:240]]
res2 = np.fft.ifft(tmp)
plt.plot(data_prop[1000:1168], color='black')
plt.plot(np.real(res2[1000:1168]), color='red', linewidth=3)
plt.show()
The following plot is the red line in the previous plot plotted against a 4hr behind version of itself -- it doesn't really add anything but I just want to keep the picture for it somewhere as it's quite pretty!
# Phase portrait: the cleaned (10-harmonic) signal against a 4-hour
# lagged copy of itself -- kept for the picture; adds no analysis.
res1_lagged = np.roll(np.real(res1),4)
plt.plot(np.real(res1), res1_lagged, linewidth=0.1)
plt.show()
This following subsection definitely needs more attention from me but is essentially applying the same process to the absolute size of the LCC, and shows that if we look at the absolute size rather than the proportion, we see not just a 24-hour frequency but also a weekly frequency (the component is smaller at weekends).
# Repeat the same process, this time with the "size of LCC" data.
with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
    x= pd.DataFrame(x['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    x.set_index('index', inplace=True)
    json_file.close()  # redundant inside `with`; kept as-is
# Detrend so the FFT is not dominated by long-term growth.
data_biggest = signal.detrend(np.array(x['biggest']))
plt.plot(range(len(data_biggest)),data_biggest)
plt.title('Plot of original data (detrended)')
plt.show()
# Fourier series of the detrended LCC-size data (fs = 168 hours = 1 week,
# so frequencies read as "cycles per week").
fs = 168
y = fftpack.fft(data_biggest)
# y is a vector of complex Fourier coefficients
freqs = fftpack.fftfreq(len(data_biggest))*fs
fig, ax = plt.subplots()
ax.stem(freqs, np.abs(y))
ax.set_xlabel('Frequency (times per '+str(fs)+'hrs)')
ax.set_xlim(0,0.5*fs)  # positive frequencies up to Nyquist
ax.set_ylabel('Frequency Domain (Spectrum) Magnitude')
plt.title('Fourier transform of hourly data')
plt.tight_layout()
plt.show()
# Compute power spectrum for clearer picture
freqs, P_xx = signal.periodogram(data_biggest, fs, scaling = 'density')
plt.plot(freqs, P_xx)
plt.title('Power spectrum of data')
plt.show()
# Inverse FT of the first few frequencies of the LCC-size spectrum.
# The buffer must be complex: a float np.zeros() silently discards the
# imaginary part of the FFT coefficients on assignment (ComplexWarning).
tmp = np.zeros(len(y), dtype=complex)
tmp[:20] = y[:20]
IFT = fftpack.ifft(tmp)
plt.plot(range(len(data_biggest)), data_biggest, color='black')
plt.plot(range(len(data_biggest)), np.real(IFT), color='red')
plt.show()
# IFT of the 50 largest-magnitude harmonics against real data for 1 week.
# (The original comment said 10, but 50 coefficients are kept below.)
ix_full = np.argsort(-1 * np.absolute(y))
res_full = np.absolute(y[ix_full])
tmp = np.zeros(len(y), dtype=complex)  # np.complex removed in NumPy 1.24
tmp[ix_full[:50]] = y[ix_full[:50]]
res1 = np.fft.ifft(tmp)
plt.figure(figsize=(8, 5))
plt.plot(range(168), data_biggest[1000:1168], color='black')
plt.plot(range(168), np.real(res1[1000:1168]), color='red', linewidth=3)
plt.show()
# IFT of the 240 largest harmonics, shown over a longer (720-hour) stretch
# so the weekly frequency is visible.
tmp = np.zeros(len(y), dtype=complex)  # np.complex removed in NumPy 1.24
tmp[ix_full[:240]] = y[ix_full[:240]]
res2 = np.fft.ifft(tmp)
plt.plot(data_biggest[10000:10720], color='black')
plt.plot(np.real(res2[10000:10720]), color='red', linewidth=2)
plt.show()
# See if the series with the harmonics removed is still autocorrelated.
without_harmonics = data_biggest - np.real(res2)

def _acf(series, nlags):
    """Autocorrelation for lags 0..nlags-1 using the biased (1/n)
    estimator. Defined locally because `acf` was not imported anywhere
    in this notebook (statsmodels.tsa.stattools.acf was presumably
    intended -- TODO confirm normalisation matches)."""
    s = np.asarray(series, dtype=float)
    s = s - s.mean()
    denom = np.dot(s, s)  # n * variance
    return np.array([np.dot(s[: len(s) - k], s[k:]) / denom
                     for k in range(nlags)])

plt.stem(range(24), _acf(without_harmonics, 24))
plt.show()
# Phase portrait of the 240-harmonic reconstruction against a 4-hour
# lagged copy of itself.
res1_lagged = np.roll(np.real(res2),4)
plt.plot(np.real(res2), res1_lagged, linewidth=0.1)
plt.show()
Keep the same links but reorder their timestamps randomly, so that the rate of edge activity is conserved and that the aggregate graph is identical. For more detail on this null model, take a look at Temporal Networks, P. Holme, J. Saramaki (2011) under the heading Randomly Permuted Times (p17).
We find that the value for the proportion is always smaller in the shuffled timestamps case than the real data, only slightly so for window sizes greater than a day, but largely so for the hour window. My thoughts are that this is due to the 'memory effect'/'edge persistence' in the real data, i.e. pairwise interactions are fairly bursty, and the chance of an interaction between two users decreases the longer it's been since the last interaction. In this way, we might expect to see a higher number of unique nodes by randomly sampling a number of edges throughout the whole time period than sampling the same number of edges but within a small time slice (i.e. a larger denominator in the 'proportion' for the shuffled than for unshuffled).
def nullComparePlot(toPlot, x, y, scale, start, end, window, lims):
    """Compare `toPlot` for the real data against the shuffled-timestamp
    null model (Holme & Saramaki 2011, "Randomly Permuted Times") for a
    single window size.

    Parameters
    ----------
    toPlot : column to plot (e.g. 'proportionWithoutIslands', 'biggest').
    x, y   : axis-label strings.
    scale  : y-axis scale ('linear' or 'log').
    start, end : date strings used to slice the time-indexed data.
    window : one of "Hour", "4 Hour", "6 Hour", "12 Hour", "Day", "Week",
             "Month", "Year".
    lims   : y-axis limits as [low, high].
    """
    # Day-and-up windows live in bigCC/sortedCC, keyed by size in ms.
    windows = {"Year": 31536000000, "Month": 2592000000,
               "Week": 604800000, "Day": 86400000}
    # Sub-day windows live in the 'secondset' files.
    subday = {"4 Hour": 14400000, "6 Hour": 21600000, "12 Hour": 43200000}
    fig, ax = plt.subplots(figsize=(11.7, 8.27))  # A4 landscape
    plt.xlabel(x, fontsize=30)
    plt.ylabel(y, fontsize=30)
    ax.set_yscale(scale)
    plt.ylim(lims)

    def plot_file(path, label, windowsize=None):
        # Load one stats file, slice to [start, end], drop zero rows,
        # optionally restrict to one window size, and plot on the axis.
        with open(path) as fh:
            frame = pd.DataFrame(json.load(fh)['views'])
        frame['time'] = pd.to_datetime(frame['time'], unit='ms')
        frame['index'] = frame['time']
        frame.set_index('index', inplace=True)
        frame = frame[start:end]
        frame = frame[frame[toPlot] != 0]
        if windowsize is not None:
            frame = frame[frame['windowsize'] == windowsize]
        frame.plot(x='time', y=toPlot, ax=ax, label=label)

    if window == "Hour":
        plot_file('CC/cc1hour.json', "Real Data")
        plot_file('CC/sortedCChour.json', "Shuffled Timestamps")
    elif window in subday:
        plot_file('CC/ccwindowsecondset.json', "Real Data", subday[window])
        plot_file('CC/sortedhourslotcc.json', "Shuffled Timestamps",
                  subday[window])
    else:
        plot_file('CC/bigCC.json', "Real Data", windows[window])
        plot_file('CC/sortedCC.json', "Shuffled Timestamps", windows[window])

    ax.legend(fontsize=20, framealpha=0.9, loc='lower right')
    plt.tight_layout()
    plt.rc('xtick', labelsize=20)
    plt.rc('ytick', labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.xlabel("Date")
# Real vs shuffled-timestamp LCC proportion: one-month slice for sub-day
# windows, full study period for day and larger windows.
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','Hour',[0.0,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','4 Hour',[0.0,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2017-09-30 23:00:00','2017-10-30','6 Hour',[0.0,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Day',[0.6,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Week',[0.6,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Month',[0.6,1.0])
nullComparePlot('proportionWithoutIslands','Date','Proportion','linear','2016-09-30 23:00:00','2018-10-30','Year',[0.6,1.0])
For this part we instead plot the absolute size of the largest connected component for different windows. Like the proportion, the LCC size is smaller for the shuffled data than the real in the hour window, but confusingly this order switches going up to the day, week and month windows.
For the day, week and month window, I suspect that 'memory effect' might explain this too, in that you're more likely to sample 'weak ties' if sampling from the whole time period as opposed to the same number but from a small time interval.
# Same comparison for the absolute LCC size; y-limits grow with window.
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','Hour',[0,450])
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','4 Hour',[0,1000])
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','6 Hour',[0,1500])
nullComparePlot('biggest','Date','Size of LCC','linear','2017-09-30 23:00:00','2017-10-30','12 Hour',[0,2000])
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Day',[0,10000])
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Week',[0,50000])
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Month',[0,50000])
nullComparePlot('biggest','Date','Size of LCC','linear','2016-09-30 23:00:00','2018-10-30','Year',[0,100000])
We compare also (on a smaller time interval because of time constraints!) the size of the LCC for the real data with that of an Erdos-Renyi random graph with the same number of nodes and edges.
Specifically we generate, for each window size and time, a graph with the same number of nodes and edges as the real data, with the edges assigned at random, and compare the size of the LCC of this with the real data.
For each window size, the LCC is consistently overestimated by the E-R model, which may be due to some strong underlying community structure, whereby the second, third etc largest connected components in the real data are non-negligible in size.
# Size of largest connected component compared to the expectation for an
# Erdos-Renyi G(n, m) graph with matched node and edge counts.
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
import networkx as nx
def get_expected_size(n, m):
    """Estimate the expected LCC size of an Erdos-Renyi G(n, m) graph.

    Generates five random graphs with ``n`` nodes and ``m`` edges and
    returns the mean size of their largest connected components.

    Args:
        n: number of vertices of the reference graph.
        m: number of edges of the reference graph.

    Returns:
        float: mean largest-connected-component size over 5 samples
        (0.0 for an empty graph).
    """
    sizes = np.zeros(5)
    for i in range(5):
        G = nx.gnm_random_graph(n, m)
        # nx.connected_component_subgraphs was removed in NetworkX 2.4.
        # connected_components yields node sets directly, whose lengths are
        # the component sizes we need — no subgraph construction required.
        # `default=set()` guards the n == 0 case (no components at all).
        largest = max(nx.connected_components(G), key=len, default=set())
        sizes[i] = len(largest)
    return np.mean(sizes)
# Hour window: real LCC size versus the E-R expectation over one week.
# `start`/`end` are also reused by the day/week/month cells below.
start, end = '2017-9-23 23:00:00','2017-09-30 23:00:00'
with open('degree/degreehours.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
# Duplicate the timestamp: 'index' becomes the DatetimeIndex used for the
# date-range slice below, while 'time' stays as a plottable column.
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
# For every hourly snapshot, estimate the LCC of an E-R graph with the same
# number of vertices and edges (mean over 5 random graphs).
x['mean'] =x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
#plt.fill_between(x['time'],np.array(x['mean'])-np.array(x['sd']), np.array(x['mean'])+np.array(x['sd']),alpha=0.3)
#print(x['expected'])
ax.plot(x['time'],x['mean'],label='Expected size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
# Overlay the real LCC sizes from the hourly connected-components view.
with open('CC/cc1hour.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.savefig("Graphs/largest_vs_expected_hour.png")
plt.show()
# Day window (86400000 ms): real LCC size versus the E-R expectation.
# Reuses `start`/`end` from the hour-window cell above.
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==86400000]  # keep only day-window rows
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
# E-R expected LCC size for each snapshot's (vertices, edges) pair.
x['mean'] =x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
#print(x['expected'])
#ax.fill_between(x['time'],np.array(x['mean'])-np.array(x['sd']), np.array(x['mean'])+np.array(x['sd']),alpha=0.3)
ax.plot(x['time'],x['mean'],label='Expected size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
# Overlay the real LCC sizes for the same window size.
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==86400000]
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()
# Week window (604800000 ms): real LCC size versus the E-R expectation.
# Reuses `start`/`end` from the hour-window cell above.
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
ax.set_yscale('linear')
with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==604800000]  # keep only week-window rows
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
# E-R expected LCC size for each snapshot's (vertices, edges) pair.
x['expected']=x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
#print(x['expected'])
x.plot(x='time',y='expected',ax=ax,label='Expected size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
# Overlay the real LCC sizes for the same window size.
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==604800000]
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()
# Month window (2592000000 ms): real LCC size versus the E-R expectation.
# Reuses `start`/`end` from the hour-window cell above.
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
#plt.title(title,fontsize=30)
plt.xlabel('Time',fontsize=20)
ax.set_yscale('linear')
with open('degree/degreewindows.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==2592000000]  # keep only month-window rows
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
# E-R expected LCC size for each snapshot's (vertices, edges) pair.
x['expected']=x.apply(lambda row : get_expected_size(row['vertices'],row['edges']),axis=1)
#print(x['expected'])
x.plot(x='time',y='expected',ax=ax,label='Expected size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
# Overlay the real LCC sizes for the same window size.
with open('CC/bigCC.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x=x[x['windowsize']==2592000000]
x['index'] = pd.to_datetime(x['time'],unit='ms')
x['time'] = pd.to_datetime(x['time'],unit='ms')
x.set_index('index', inplace=True)
x =x[start:end]
x.plot(x='time',y='biggest',ax=ax, label='Size of LCC')
json_file.close()  # redundant: the with-statement already closed the file
plt.xlabel('Time',fontsize=30)
plt.rc('xtick',labelsize=20)
plt.rc('ytick',labelsize=20)
plt.legend(fontsize=20)
plt.show()
For each window size and window, we obtain the top 20 users in terms of in-degree. Is it the case that some users dominate for long periods, or is it more dynamic?
For two sets A and B, the Jaccard similarity is defined as J(A, B) = |A ∩ B| / |A ∪ B|, a ratio between 0 and 1 that measures the relative overlap of the two sets.
def jaccard_similarity(list1, list2):
    """Return the Jaccard index |A n B| / |A u B| of two iterables.

    Duplicates are ignored (inputs are converted to sets). An empty union
    yields 0.0 rather than raising ZeroDivisionError.
    """
    a, b = set(list1), set(list2)
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)
For each window size, we calculate the Jaccard similarity between the top 20 users of each pair of consecutive non-overlapping windows. The lower plot shows the mean Jaccard similarity for each window size, but as the error bars overlap and the number of data points drops sharply for larger window sizes, there is little concrete we can conclude from it yet.
def get_users(cell):
    """Extract the 'id' field from a list of top-user records.

    `cell` is the list of dicts stored in a 'bestusers' cell. Returns an
    empty list when the cell is empty, otherwise the pandas Series of ids.
    """
    if not len(cell):
        return []
    return pd.DataFrame(cell)['id']
def rank_jaccard_fast(x, y, title, scale):
    """Plot the Jaccard similarity between the top-20 user sets of
    consecutive, non-overlapping windows, for each window size.

    Parameters:
        x, y: axis labels for the time-series figure.
        title: unused (kept for interface parity with the other helpers).
        scale: y-axis scale passed to ax.set_yscale ('linear', 'log', ...).

    Produces two figures: a per-window-size time series (saved to
    graphs/JaccardSimilarity.png) and a bar chart of mean +/- std of the
    similarity per window size (saved to Graphs/JaccardMeanSD.png).
    """
    windows = [31536000000,2592000000,604800000,86400000]
    # Five labels: the four windows above plus the hour data loaded separately.
    labels = ['Year Window','Month Window','Week Window','Day Window', 'Hour Window']
    a4_dims = (11.7, 8.27)
    day_length=86400000
    fig, ax = plt.subplots(figsize=a4_dims)
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    means=np.zeros(5)
    sds=np.zeros(5)
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
    json_file.close()  # redundant: the with-statement already closed the file
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    index = 0
    for i in windows:
        # Keep every n-th row so consecutive kept rows correspond to
        # non-overlapping windows of length i (rows appear at day spacing,
        # judging by day_length — TODO confirm against the data producer).
        n = round(i/day_length)
        y = degs[degs['windowsize'] == i]  # NOTE: shadows the y-label parameter
        y = y.iloc[::n,:]
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        # Previous window's top users, one row down.
        y['nexttop'] = y['topusers'].shift(1)
        # Blank the first row's NaN so jaccard_similarity receives a list.
        # NOTE(review): chained assignment with a positional key on a
        # DatetimeIndex relies on deprecated pandas fallback behaviour —
        # verify it still takes effect on the pandas version in use.
        y['nexttop'][0]=[]
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
        means[index]=np.mean(y['jaccard'])
        sds[index]=np.std(y['jaccard'])
        jaccard['Time']=y['time']
        jaccard[labels[index]]=y['jaccard']
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1
    jaccard.set_index('Time',inplace=True)
    # The hour window lives in its own file (no subsampling needed).
    with open('degree/degreehours.json') as json_file:
        degs = json.load(json_file)
    json_file.close()  # redundant: the with-statement already closed the file
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    y = degs
    y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
    y['nexttop'] = y['topusers'].shift(1)
    y['nexttop'][0]=[]
    y['jaccard']= y.apply(lambda x: 0.0, axis=1)
    y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
    means[index]=np.mean(y['jaccard'])
    sds[index]=np.std(y['jaccard'])
    jaccard['Time']=y['time']
    jaccard[labels[index]]=y['jaccard']
    ax.plot(y['time'],y['jaccard'], label=labels[index])
    print(means)
    print(sds)
    plt.legend(fontsize=20, loc='lower right')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarity.png')
    plt.show()
    # Second figure: mean +/- std per window size, smallest window first.
    fig, ax = plt.subplots(figsize=a4_dims)
    windows = [31536000000,2592000000,604800000,86400000,3600000]
    windows.reverse()
    new_means = np.flip(means)
    new_sds = np.flip(sds)
    labels.reverse()
    print(labels)
    ax.bar(np.arange(5), new_means, yerr=new_sds, align='center', alpha=0.5, ecolor='black')
    plt.xlabel('Window Size',size=20)
    ax.set_xticks(np.arange(5))
    ax.set_xticklabels(labels,size=20,rotation=30)
    plt.ylabel('Similarity between consecutive windows', size=20)
    plt.savefig('Graphs/JaccardMeanSD.png')
    plt.show()
rank_jaccard_fast("Date", "Similarity", "Jaccard similarity index of consecutive node rankings", 'linear')
As with the connected components, we look at the effect of randomly shuffling the timestamps. It seems to have a "smoothing effect", suggesting that in the original data, many of the users who reach the top 20 may only do so for a short period of time.
def rank_jaccard_shuffled(x, y, title, scale):
    """Shuffled-timestamp counterpart of rank_jaccard_fast.

    Identical logic, but reads the null-model views
    (degree/degreesorted.json and degree/degreesortedhour.json) and saves
    to graphs/JaccardSimilarityShuffled.png and Graphs/JaccardMeanSDShuffled.png.

    Parameters:
        x, y: axis labels; title: unused; scale: y-axis scale.
    """
    windows = [31536000000,2592000000,604800000,86400000]
    # Five labels: the four windows above plus the hour data loaded separately.
    labels = ['Year Window','Month Window','Week Window','Day Window', 'Hour Window']
    a4_dims = (11.7, 8.27)
    day_length=86400000
    fig, ax = plt.subplots(figsize=a4_dims)
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    means=np.zeros(5)
    sds=np.zeros(5)
    with open('degree/degreesorted.json') as json_file:
        degs = json.load(json_file)
    json_file.close()  # redundant: the with-statement already closed the file
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    index = 0
    for i in windows:
        # Keep every n-th row so consecutive kept rows are non-overlapping
        # windows of length i.
        n = round(i/day_length)
        y = degs[degs['windowsize'] == i]  # NOTE: shadows the y-label parameter
        y = y.iloc[::n,:]
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        y['nexttop'] = y['topusers'].shift(1)
        # Blank the first row's NaN so jaccard_similarity receives a list.
        # NOTE(review): chained assignment with a positional key — same
        # pandas-version caveat as in rank_jaccard_fast.
        y['nexttop'][0]=[]
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
        means[index]=np.mean(y['jaccard'])
        sds[index]=np.std(y['jaccard'])
        jaccard['Time']=y['time']
        jaccard[labels[index]]=y['jaccard']
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1
    jaccard.set_index('Time',inplace=True)
    # Hour window of the shuffled data lives in its own file.
    with open('degree/degreesortedhour.json') as json_file:
        degs = json.load(json_file)
    json_file.close()  # redundant: the with-statement already closed the file
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    y = degs
    y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
    y['nexttop'] = y['topusers'].shift(1)
    y['nexttop'][0]=[]
    y['jaccard']= y.apply(lambda x: 0.0, axis=1)
    y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],row['nexttop']), axis=1)
    means[index]=np.mean(y['jaccard'])
    sds[index]=np.std(y['jaccard'])
    jaccard['Time']=y['time']
    jaccard[labels[index]]=y['jaccard']
    ax.plot(y['time'],y['jaccard'], label=labels[index])
    print(means)
    print(sds)
    plt.legend(fontsize=20, loc='lower right')
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarityShuffled.png')
    plt.show()
    # Second figure: mean +/- std per window size, smallest window first.
    fig, ax = plt.subplots(figsize=a4_dims)
    windows = [31536000000,2592000000,604800000,86400000,3600000]
    windows.reverse()
    new_means = np.flip(means)
    new_sds = np.flip(sds)
    labels.reverse()
    print(labels)
    ax.bar(np.arange(5), new_means, yerr=new_sds, align='center', alpha=0.5, ecolor='black')
    plt.xlabel('Window Size',size=20)
    ax.set_xticks(np.arange(5))
    ax.set_xticklabels(labels,size=20,rotation=30)
    plt.ylabel('Similarity between consecutive windows', size=20)
    plt.savefig('Graphs/JaccardMeanSDShuffled.png')
    plt.show()
rank_jaccard_shuffled("Date", "Similarity", "Jaccard similarity index of consecutive node rankings", 'linear')
We also compute the similarity (for the real dataset) between the top 20 users in each window size and the top 20 all-time users at a fixed reference point (around November 2016). We find:
def get_users(cell):
    """Extract the 'id' field from a list of top-user records.

    Re-definition of the helper above (this cell is self-contained in the
    notebook). Returns [] for an empty cell, otherwise the Series of ids.
    """
    if not len(cell):
        return []
    return pd.DataFrame(cell)['id']
def rank_jaccard_year_comparison(x, y, title, scale):
    """Plot, for each window size, the Jaccard similarity between that
    window's top-20 users and a fixed reference top-20 (the year window
    ending 2017-01-01 23:00:00).

    Parameters:
        x, y: axis labels.
        title: unused (kept for interface parity with the other helpers).
        scale: y-axis scale passed to ax.set_yscale.

    Saves the figure to graphs/JaccardSimilarityJan17.png.
    """
    windows = [31536000000,2592000000,604800000,86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    # NOTE(review): this frame is built but never used in this function.
    jaccard=pd.DataFrame({'Window Size':[], 'Jaccard':[]})
    #plt.title(title, size=30)
    plt.xlabel(x, size=30)
    plt.ylabel(y, size=30)
    ax.set_yscale(scale)
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
    json_file.close()  # redundant: the with-statement already closed the file
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'],unit='ms')
    degs['index'] = pd.to_datetime(degs['time'],unit='ms')
    degs.set_index('index', inplace=True)
    #get year comparison:
    # Reference set: top-20 users of the year window at the chosen timestamp.
    year = degs[degs['windowsize']==31536000000]
    jan17top = get_users(year['bestusers']['2017-01-01 23:00:00'])
    index=0
    for i in windows:
        print(i)
        y = degs[degs['windowsize'] == i]  # NOTE: shadows the y-label parameter
        y['topusers']=y.apply(lambda row: get_users(row['bestusers']),axis=1)
        y['jaccard']= y.apply(lambda x: 0.0, axis=1)
        # Similarity of every window's top-20 to the fixed reference set.
        y['jaccard'] = y.apply(lambda row: jaccard_similarity(row['topusers'],jan17top), axis=1)
        ax.plot(y['time'],y['jaccard'], label=labels[index])
        index +=1
    plt.legend(fontsize=20, loc='upper right')
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=30)
    plt.tight_layout()
    plt.savefig('graphs/JaccardSimilarityJan17.png')
    plt.show()
rank_jaccard_year_comparison('Date','Similarity','Similarity','linear')
For each user who has ever been in the top 20 in any window, we count the number of windows (for each size) in which they appear in the top 20. For example, 90% of these users appear in the daily top 20 for less than 10% of the time period. This presentation still needs refining, as the interpretation is not immediately intuitive.
from collections import Counter
from itertools import chain
def how_many_windows(x, y, title, scale):
    """For each window size, plot the cumulative distribution of how often
    users who ever reach the top 20 (by in-degree) actually appear there.

    A point (p, c) on a curve means: a fraction c of the ever-top-20 users
    appear in the top 20 for at most a fraction p of that size's windows.

    Parameters:
        x, y: axis labels.
        title: unused (kept for interface parity with the other helpers).
        scale: y-axis scale passed to ax.set_yscale.

    Reads degree/degreewindows.json and shows (does not save) the figure.
    """
    windows = [31536000000, 2592000000, 604800000, 86400000]
    labels = ['Year Window', 'Month Window', 'Week Window', 'Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title, size=30)
    plt.xlabel(x, size=20)
    plt.ylabel(y, size=20)
    ax.set_yscale(scale)
    with open('degree/degreewindows.json') as json_file:
        degs = json.load(json_file)
    degs = pd.DataFrame(degs['views'])
    degs['time'] = pd.to_datetime(degs['time'], unit='ms')
    degs['index'] = pd.to_datetime(degs['time'], unit='ms')
    degs.set_index('index', inplace=True)
    for index, i in enumerate(windows):
        win = degs[degs['windowsize'] == i]
        total_windows = len(win)
        win['topusers'] = win.apply(lambda row: get_users(row['bestusers']), axis=1)
        # users_count[u] = number of windows in which user u is top-20.
        users_count = Counter(chain.from_iterable(set(row) for row in win['topusers']))
        # meta[k] = number of users appearing in exactly k windows.
        meta = Counter(users_count.values())
        x1, y1 = zip(*sorted(meta.items()))
        x1 = np.array(x1) / total_windows          # fraction of windows
        y1 = np.cumsum(np.array(y1)) / len(users_count)  # CDF over users
        # (The original also computed the complement `cdf = 1 - y1` but
        # never used it; removed.)
        ax.plot(x1, y1, label=labels[index])
    plt.legend()
    plt.tight_layout()
    plt.show()
how_many_windows('Proportion of windows','CDF of users in top 20 for that proportion','lol','linear')
# LCC size as a proportion of the whole graph, one smoothed curve per
# window size, over the full period.
windows = [31536000000,2592000000,604800000,86400000]
with open('bigCC.json') as json_file:
    x = json.load(json_file)
x= pd.DataFrame(x['views'])
x['time'] = pd.to_datetime(x['time'],unit='ms')
x = x[x['total'] != 0]  # drop views with no nodes at all
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
plt.title('Biggest Connected Components % of graph')
plt.xlabel('Date')
plt.ylabel('Proportion of Graph')
for i in windows:
    y = x[x['windowsize'] == i]
    # Smooth with a trailing 4-point rolling mean before plotting.
    y['mean'] = y['proportion'].rolling(window=4,center=False).mean()
    hours = str(int((i/3600000)))+" hour window"  # window length in hours
    y.plot(x='time', y='mean',ax=ax, label=hours)
plt.show()
# Compare the per-view processing time of four execution strategies
# (plain, cached, parallel, vote), one input file per strategy.
# (The original pre-initialised x/y/z/a to 0 before immediately reassigning
# them; those dead initialisers are removed.)
windows = [31536000000, 2592000000, 604800000, 86400000]
with open('bigCC.json') as json_file:
    x = json.load(json_file)
x = pd.DataFrame(x['views'])
x['time'] = pd.to_datetime(x['time'], unit='ms')
x = x[x['total'] != 0]  # drop empty views
with open('bigCC2.json') as json_file:
    y = json.load(json_file)
y = pd.DataFrame(y['views'])
y['time'] = pd.to_datetime(y['time'], unit='ms')
y = y[y['total'] != 0]
with open('bigCC3.json') as json_file:
    z = json.load(json_file)
z = pd.DataFrame(z['views'])
z['time'] = pd.to_datetime(z['time'], unit='ms')
z = z[z['total'] != 0]
with open('bigCC4.json') as json_file:
    a = json.load(json_file)
a = pd.DataFrame(a['views'])
# BUG FIX: the original converted z['time'] here (copy-paste error), which
# assigned the parallel run's timestamps to the vote run's rows.
a['time'] = pd.to_datetime(a['time'], unit='ms')
a = a[a['total'] != 0]
a4_dims = (11.7, 8.27)
fig, ax = plt.subplots(figsize=a4_dims)
plt.title('Time taken to process view')
plt.xlabel('Date')
plt.ylabel('Time in Milliseconds')
x.plot(x='time', y='viewTime', ax=ax, label="Full view with all windows")
y.plot(x='time', y='viewTime', ax=ax, label="Full view with all windows with caching")
z.plot(x='time', y='viewTime', ax=ax, label="Full view with all windows Parallel")
a.plot(x='time', y='viewTime', ax=ax, label="Full view with all windows vote")
plt.show()
def percentagePlot(toPlot,title,x,y,scale,start,end):
    """Plot, per window size, the real metric as a percentage of the
    timestamp-shuffled (null-model) metric over [start, end].

    Parameters:
        toPlot: column to compare ('vertices', 'edges' or 'avgdeg').
        title: unused (kept for interface parity with the other helpers).
        x, y: axis labels.
        scale: y-axis scale passed to ax.set_yscale.
        start, end: date strings used to slice the DatetimeIndex.

    Reads the real views (degree/degreewindows.json, degree/degreehours.json)
    and the shuffled views (degree/degreesorted.json,
    degree/degreesortedhour.json) and saves graphs/<toPlot>.png.
    """
    windows = [31536000000,2592000000,604800000, 86400000]
    labels = ['Year Window','Month Window','Week Window','Day Window']
    a4_dims = (11.7, 8.27)
    fig, ax = plt.subplots(figsize=a4_dims)
    #plt.title(title,fontsize=30)
    plt.xlabel(x,fontsize=30)
    plt.ylabel(y,fontsize=30)
    ax.set_yscale(scale)
    # NOTE(review): ax._get_lines.prop_cycler is a private matplotlib API
    # (removed in matplotlib >= 3.8); it is used here to draw each real/
    # shuffled pair in a shared colour. Verify against the pinned version.
    color=next(ax._get_lines.prop_cycler)['color']
    with open('degree/degreewindows.json') as json_file:
        with open('degree/degreesorted.json') as json_2:
            cc1 = json.load(json_file)
            cc2 = json.load(json_2)
    # cc1 = real data, cc2 = shuffled-timestamp null model.
    cc1= pd.DataFrame(cc1['views'])
    cc2= pd.DataFrame(cc2['views'])
    cc1['time'] = pd.to_datetime(cc1['time'],unit='ms')
    cc2['time'] = pd.to_datetime(cc2['time'],unit='ms')
    cc1['index'] = pd.to_datetime(cc1['time'],unit='ms')
    cc2['index'] = pd.to_datetime(cc2['time'],unit='ms')
    cc1.set_index('index', inplace=True)
    cc2.set_index('index', inplace=True)
    cc1=cc1[start:end]
    cc2=cc2[start:end]
    index = 0
    for i in windows:
        color=next(ax._get_lines.prop_cycler)['color']
        y = cc1[cc1['windowsize'] == i]  # NOTE: shadows the y-label parameter
        z = cc2[cc2['windowsize'] == i]
        # Average degree 2E/V, guarded against the empty graph (V < 1).
        y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
        z['avgdeg']= np.where(z['vertices']<1, z['vertices'], 2*z['edges']/z['vertices'])
        # Real value as a percentage of the shuffled value (0 where the
        # shuffled value is below 1, to avoid dividing by zero).
        z['perc'] = np.where(z[toPlot]<1, 0, 100.0*y[toPlot]/z[toPlot])
        #y = y[y[toPlot] != 0]
        z.plot(x='time', y='perc',ax=ax, label=labels[index], color = color, linestyle="-", alpha=0.8)
        index +=1
    # Hour window comes from separate files and is drawn fainter (alpha=0.3).
    with open('degree/degreehours.json') as json_file:
        with open('degree/degreesortedhour.json') as json_2:
            color=next(ax._get_lines.prop_cycler)['color']
            x = json.load(json_file)  # NOTE: shadows the x-label parameter
            y = json.load(json_2)
    x= pd.DataFrame(x['views'])
    y= pd.DataFrame(y['views'])
    x['index'] = pd.to_datetime(x['time'],unit='ms')
    y['index'] = pd.to_datetime(y['time'],unit='ms')
    x['time'] = pd.to_datetime(x['time'],unit='ms')
    y['time'] = pd.to_datetime(y['time'],unit='ms')
    x.set_index('index', inplace=True)
    y.set_index('index', inplace=True)
    x =x[start:end]
    y =y[start:end]
    x['avgdeg']= np.where(x['vertices']<1, x['vertices'], 2*x['edges']/x['vertices'])
    y['avgdeg']= np.where(y['vertices']<1, y['vertices'], 2*y['edges']/y['vertices'])
    y['perc'] = np.where(y[toPlot]<1, 0, 100.0*x[toPlot]/y[toPlot])
    # Hour-of-day columns; computed but not plotted in this function.
    x['hour'] = x.apply(lambda row: int(row['time'].hour),axis=1)
    y['hour'] = y.apply(lambda row: int(row['time'].hour),axis=1)
    x = x[x[toPlot] != 0]
    y = y[y[toPlot] != 0]
    y.plot(x='time', y='perc',ax=ax, color=color, label='Hour', alpha=0.3)
    #plt.axvline('2016-11-09',linestyle='')
    #plt.axvline('2017-08-11')
    #plt.axvline('2017-08-13')
    plt.legend(fontsize=20, loc='upper left')
    plt.xlabel('Date',fontsize=30)
    plt.rc('xtick',labelsize=20)
    plt.rc('ytick',labelsize=20)
    plt.savefig('graphs/'+toPlot+'.png')
    plt.show()
percentagePlot('edges','Number of Vertices','Date','Percentage against shuffled','linear','2016-09-30 23:00:00','2018-10-30')